Overview

There are two main parts to this story:

  1. A simple easy model of the key relationship

  2. A medium complexity smoothing analysis

The two data sets used in this analysis are the Madison case counts and the wastewater concentration data.

##           Date    Site Cases    N1 N1Error
## 435 2021-06-19 Madison     3    NA      NA
## 436 2021-06-20 Madison     8 49443    8812
## 437 2021-06-21 Madison     1 90447   20429
## 438 2021-06-22 Madison     3 44587   12427
## 439 2021-06-23 Madison     4 14469    9155
## 440 2021-06-24 Madison    NA 28023    7724

A simple display of the data shows the core components of this story. First, both data sets are extremely noisy. Second, there is a hint of a relationship between the two signals.

# First look at the raw series: min-max normalize both signals and overlay
# them on a shared time axis so any co-movement is visible despite the noise.
FirstImpression <- FullDF %>%
  NoNa("N1", "Cases") %>%  # drop rows where either series is missing
  ggplot(aes(x = Date)) +  # everything is indexed by calendar date
  geom_line(aes(y = MinMaxNormalization(N1), color = "N1", info = N1)) +
  geom_line(aes(y = MinMaxNormalization(Cases), color = "Cases", info = Cases)) +
  labs(y = "variable min max normalized") +
  ColorRule

# Interactive version; "info" carries the raw value into the hover tooltip.
ggplotly(FirstImpression, tooltip = c("info", "Date"))

Cross correlation and Granger causality are key components of this analysis. Cross correlation looks at the correlation at a range of time shifts, and Granger analysis performs a test for predictive power. We find that there is too much noise to find significance.

# Build a gap-free pair of series for cross-correlation / Granger testing.
TestDF1 <- FullDF %>%
  FillNA("N1", "Cases") %>%  # fill interior gaps with the previous value
  NoNa("N1", "Cases")        # drop leading rows from before both series started

Cases <- TestDF1[["Cases"]]
N1 <- TestDF1[["N1"]]

# Cross-correlation of the two raw series across a range of lags.
ccf(Cases, N1, na.action = na.pass)

# Granger test: do lagged Cases help predict N1?  On the raw, noisy data
# neither direction reaches significance at the 0.05 level (recorded below).
grangertest(Cases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(Cases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df     F Pr(>F)
## 1    279                
## 2    280 -1 2.623 0.1065
# Reverse direction: do lagged N1 values help predict Cases?
grangertest(N1,Cases, order = 1)
## Granger causality test
## 
## Model 1: Cases ~ Lags(Cases, 1:1) + Lags(N1, 1:1)
## Model 2: Cases ~ Lags(Cases, 1:1)
##   Res.Df Df      F Pr(>F)
## 1    279                 
## 2    280 -1 0.1124 0.7377

From a first pass it is clear that the waste water measurements before 11/20/2020 did not function as an effective measure of the amount of shedding in the community. So for this analysis we remove waste water data from before that point. Also, there are some extreme outliers that we remove for more effective analysis.

# Flag controlling whether the intermediate outlier diagnostic is rendered.
IntermediateOutlierGraphic <- TRUE

# Very wide rolling window so genuine trends survive the smoothing while
# individual measurements that strongly deviate from trend stand out.
DaySmoothed <- 21

# Remove older N1 data (before 11/20/2020) that clearly has no relationship
# to Cases.
FullDF2 <- FullDF %>%
  mutate(N1 = ifelse(Date < mdy("11/20/2020"), NA, N1))

# Replace extreme N1 outliers (> 5x the 21-day rolling median) with the
# rolling median itself.
FullDF3 <- FullDF2 %>%
  mutate(SmoothN1 = rollapply(data = N1, width = DaySmoothed, FUN = median,
                              na.rm = TRUE, fill = NA),  # was `na.r`: spelled out to avoid fragile partial argument matching through rollapply's `...`
         SmoothN1 = ifelse(is.na(SmoothN1), N1, SmoothN1),  # rollapply leaves NA at the borders; fall back to the raw value there
         LargeError = N1 > 5 * SmoothN1,  # outlier criterion
         N1 = ifelse(LargeError, SmoothN1, N1)) %>%  # substitute the smooth value where variance is too large
  select(-SmoothN1, -LargeError)  # drop the helper columns

# Diagnostic graphic: mark which N1 measurements the cleaning step treats as
# outliers (early-assay dates or > 5x the rolling median), plotted on the
# UNCLEANED data so the removed points are still visible.
if (IntermediateOutlierGraphic) {
  OutlierGraphic <- FullDF %>%
    mutate(SmoothN1 = rollapply(data = N1, width = DaySmoothed, FUN = median,
                                na.rm = TRUE, fill = NA),  # was `na.r`: spelled out to avoid fragile partial argument matching through rollapply's `...`
           SmoothN1 = ifelse(is.na(SmoothN1), N1, SmoothN1)) %>%  # fall back to raw N1 where rollapply left border NAs
    mutate(Outliers = Date < mdy("11/20/2020") | N1 > 5 * SmoothN1) %>%  # same criteria as the cleaning step
    NoNa("N1", "Cases") %>%
    ggplot(aes(x = Date)) +
    geom_point(aes(y = MinMaxNormalization(Cases), color = "Cases", info = Cases)) +
    geom_point(aes(y = MinMaxNormalization(N1), color = "N1", shape = Outliers, info = N1)) +
    labs(y = "variable min max normalized")
  ggplotly(OutlierGraphic, tooltip = c("info", "Date"))
}
# Re-plot the cleaned data: with the early assay period and the extreme
# outliers removed, the two normalized series track each other visibly.
Ref <- FullDF3 %>%
  NoNa("N1", "Cases") %>%  # drop rows where either series is missing
  ggplot(aes(x = Date)) +
  geom_line(aes(y = MinMaxNormalization(N1), color = "N1", info = N1)) +
  geom_line(aes(y = MinMaxNormalization(Cases), color = "Cases", info = Cases)) +
  labs(y = "variable min max normalized") +
  ColorRule

ggplotly(Ref, tooltip = c("info", "Date"))

We now find a significant relationship. However, there is some danger in performing these kinds of tests on non-stationary time series.

# Rebuild the test series, this time from the cleaned data.
TestDF2 <- FullDF3 %>%
  FillNA("N1", "Cases") %>%  # fill interior gaps with the previous value
  NoNa("N1", "Cases")        # drop rows from before both series started

Cases <- TestDF2[["Cases"]]
N1 <- TestDF2[["N1"]]

# library(tseries) would supply formal stationarity tests; obvious candidates:
# kpss.test(Cases)
# adf.test(Cases)
# kpss.test(N1)
# adf.test(N1)

ccf(Cases, N1, na.action = na.pass)

# Granger tests repeated on the cleaned series.  Cases -> N1 is now highly
# significant and N1 -> Cases is significant at the 0.05 level (output below).
grangertest(Cases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(Cases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df     F    Pr(>F)    
## 1    213                       
## 2    214 -1 29.12 1.806e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reverse direction: do lagged N1 values help predict Cases?
grangertest(N1,Cases, order = 1)
## Granger causality test
## 
## Model 1: Cases ~ Lags(Cases, 1:1) + Lags(N1, 1:1)
## Model 2: Cases ~ Lags(Cases, 1:1)
##   Res.Df Df      F  Pr(>F)  
## 1    213                    
## 2    214 -1 4.9769 0.02673 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

A key component of this relationship is that the link between N1 and Cases involves a gamma distribution modeling both the time between catching Covid-19 and getting a test and the concentration of the shed particles. We found that a gamma distribution with mean 11.73 days and a standard deviation of 7.68 matches other research and gives good results.

# Shedding-load distribution (SLD) kernel: a gamma density evaluated at
# integer lags of 1..SLDWidth days.
SLDWidth <- 21
scale  <- 5.028338
shape  <- 2.332779 # These parameters are equivalent to the mean and sd above

# Kernel weights at each day of lag.
weights <- dgamma(1:SLDWidth, scale = scale, shape = shape)

# plot() takes the title string directly; the original wrapped it in a
# needless single-argument paste().
plot(weights,
     main = "Gamma Distribution with mean = 11.73 days,and SD = 7.68",
     ylab = "Weight",
     xlab = "Lag")

# Convolve the case counts with the gamma kernel ("SLD" smoothing): each
# SLDCases value is a weighted mean over a trailing SLDWidth-day window of
# Cases.  Prepending SLDWidth-1 NAs right-aligns the rollapply output with
# the original dates.
# NOTE(review): rollapply forwards `w = weights` through `...` to
# weighted.mean, so weights[1] applies to the OLDEST day in each window —
# confirm this orientation matches the intended lag distribution.
SLDSmoothedDF <- FullDF3%>%
  mutate(
    SLDCases = c(rep(NA,SLDWidth-1),# drop-in NAs for the start-up period; not relevant as we have a 50+ day buffer of case data
                        rollapply(Cases,width=SLDWidth,FUN=weighted.mean,
                                  w=weights,
                                  na.rm = FALSE)))# na.rm = FALSE is deliberate: no missing data remain to remove


# Same comparison plot as earlier, but with the SLD-smoothed case counts.
SLDPlot <- SLDSmoothedDF %>%
  NoNa("N1", "SLDCases") %>%
  ggplot(aes(x = Date)) +
  geom_line(aes(y = MinMaxNormalization(SLDCases), color = "SLDCases", info = SLDCases)) +
  geom_line(aes(y = MinMaxNormalization(N1), color = "N1", info = N1)) +
  facet_wrap(~Site) +
  labs(y = "variable min max normalized") +
  ColorRule

ggplotly(SLDPlot, tooltip = c("info", "Date"))

The SLD improves the shape of the CCF. However, it removes the significance of N1 predicting Cases. This makes sense looking at the data.

# Gap-free series for testing the SLD-smoothed relationship.
TestDF3 <- SLDSmoothedDF %>%
  FillNA("N1", "SLDCases") %>%  # fill interior gaps with the previous value
  NoNa("N1", "SLDCases")        # drop rows from before both series started

SLDCases <- TestDF3[["SLDCases"]]
N1 <- TestDF3[["N1"]]

ccf(SLDCases, N1, na.action = na.pass)

# Granger tests with SLD-smoothed cases.  SLDCases -> N1 remains highly
# significant; N1 -> SLDCases is significant at the 0.05 level (output below).
grangertest(SLDCases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    213                        
## 2    214 -1 23.794 2.097e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reverse direction: does N1 help predict the SLD-smoothed cases?
grangertest(N1,SLDCases, order = 1)
## Granger causality test
## 
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
##   Res.Df Df      F  Pr(>F)  
## 1    213                    
## 2    214 -1 5.3844 0.02126 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To isolate this relationship we used a primitive binning approach. This clarifies the relationship we see hints of in the previous graphic and masks the noise in the data.

# Crude binning: shift cases back by `Lag` days, cut the calendar into
# `DaySmoothing`-day bins, and take the median of each series within a bin.
StartDate <- 1      # offset applied to dates before binning starts
DaySmoothing <- 14  # width of each bin in days
Lag <- 7            # offset between Cases and N1

BinDF <- SLDSmoothedDF %>%
  select(Date, Cases, N1) %>%
  mutate(MovedCases = data.table::shift(Cases, Lag),  # lag the case counts by `Lag` days
         Week = (as.numeric(Date) + StartDate) %/% DaySmoothing) %>%  # bin index via integer division
  group_by(Week) %>%
  # filter(Week > 2670) %>%
  summarise(BinnedCases = median(MovedCases, na.rm = TRUE),
            BinnedN1 = median(N1, na.rm = TRUE),
            Date = mean(Date, na.rm = TRUE))  # one summary row per bin



# Binned series over time; the coarse bins suppress day-to-day noise and
# make the shared pattern stand out.
BinedGraph <- BinDF %>%
  NoNa("BinnedN1", "BinnedCases") %>%
  ggplot(aes(x = Date)) +
  geom_line(aes(y = MinMaxNormalization(BinnedN1), color = "N1", info = BinnedN1)) +
  geom_line(aes(y = MinMaxNormalization(BinnedCases), color = "Cases", info = BinnedCases)) +
  labs(y = "Binned variable min max normalized") +
  ColorRule

ggplotly(BinedGraph, tooltip = c("info", "Date"))
BinedGraph  # static copy of the same plot

# Scatter of binned N1 against binned cases; hover shows the bin date.
BinedCorrGraph <- BinDF%>%
  ggplot()+
  geom_point(aes(x=BinnedCases, y=BinnedN1, info = Date))
ggplotly(BinedCorrGraph,tooltip=c("x","y","info"))
# Strength of the binned linear relationship (recorded output below).
cor(BinDF$BinnedN1, BinDF$BinnedCases, use="pairwise.complete.obs")
## [1] 0.892832
# Linear fit of binned cases on binned N1.
summary(lm(BinnedCases~BinnedN1, data=BinDF))
## 
## Call:
## lm(formula = BinnedCases ~ BinnedN1, data = BinDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.739 -36.261   3.998  30.477  72.814 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.098e+01  2.008e+01  -2.041   0.0593 .  
## BinnedN1     8.237e-04  1.073e-04   7.678 1.42e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.43 on 15 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.7971, Adjusted R-squared:  0.7836 
## F-statistic: 58.95 on 1 and 15 DF,  p-value: 1.424e-06

To generate this relationship without reducing the amount of data we rely on a loess smoothing of the data. The loess smoothing is a way of generating smooth curves from noisy data. The displayed plot shows the visual power of this smoothing. We see a relationship in the big patterns but also multiple sub patterns match. We see in general that smoothed N1 both lags and leads the case data.

# Loess fit of N1 over time; span and iteration count were chosen by eye
# rather than by any formal selection procedure.
SLDSmoothedDF$loessN1 <- loessFit(y = SLDSmoothedDF$N1,
                                  x = SLDSmoothedDF$Date,
                                  span = .2,  # informal choice that gave the best-looking result
                                  iterations = 2)$fitted  # a second robustifying pass removes some bad patterns

# Overlay the loess-smoothed N1 on the SLD-smoothed case counts.
SLDLoessGraphic <- SLDSmoothedDF %>%
  NoNa("loessN1", "SLDCases") %>%
  ggplot(aes(x = Date)) +
  geom_line(aes(y = MinMaxNormalization(loessN1), color = "loessN1", info = loessN1)) +
  geom_line(aes(y = MinMaxNormalization(SLDCases), color = "SLDCases", info = SLDCases)) +
  facet_wrap(~Site) +
  labs(y = "variable min max normalized") +
  ColorRule

ggplotly(SLDLoessGraphic, tooltip = c("info", "Date"))

The loess smoothing gives the best CCF relation and shape. It also only insignificantly changes the Granger tests.

# Final test set: loess-smoothed N1 against SLD-smoothed cases.
TestDF4 <- SLDSmoothedDF %>%
  FillNA("loessN1", "SLDCases") %>%  # fill interior gaps with the previous value
  NoNa("loessN1", "SLDCases")        # drop rows from before both series started

SLDCases <- TestDF4[["SLDCases"]]
N1 <- TestDF4[["loessN1"]]  # note: the N1 variable now holds the loess-smoothed series

ccf(SLDCases, N1, na.action = na.pass)

# Granger tests with the loess-smoothed N1 (held in the N1 variable).
# SLDCases -> loess N1 loses significance; N1 -> SLDCases stays significant.
grangertest(SLDCases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F Pr(>F)
## 1    213                 
## 2    214 -1 0.5606 0.4549
# Reverse direction: does the loess-smoothed N1 predict SLD-smoothed cases?
grangertest(N1,SLDCases, order = 1)
## Granger causality test
## 
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
##   Res.Df Df      F  Pr(>F)  
## 1    213                    
## 2    214 -1 4.7834 0.02982 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1